# -*- coding: utf-8 -*-
import time

import scrapy
import json, re

class Lagou2Spider(scrapy.Spider):
    """Crawl Lagou's job-search AJAX endpoint for 'python' positions, page by page.

    The site paginates results; each page is fetched with a POST whose form
    carries ``first`` / ``pn`` (page number) / ``kd`` (keyword) fields.
    """
    name = 'lagou2'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=']
    # The AJAX endpoint rejects requests without a matching Referer header.
    headers = dict(Referer="https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=")
    # Throttle via Scrapy's scheduler instead of time.sleep(), which would
    # block the whole Twisted reactor (the original called time.sleep(2)).
    custom_settings = {'DOWNLOAD_DELAY': 2}
    # 只有30页 — the site serves at most 30 result pages.
    MAX_PAGE = 30

    def parse(self, response):
        """Kick off the paginated POST sequence with the page-1 form."""
        form_data = dict(first='true', pn='1', kd='python')
        yield scrapy.FormRequest(
            'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
            formdata=form_data,
            headers=self.headers,
            callback=self.parse2,
            meta={'x': form_data},
        )

    def parse2(self, response):
        """Yield every job record on this page, then request the next page.

        Fixes vs. the original:
        - job dicts were assigned to a throwaway local and never yielded,
          so the pipeline received nothing;
        - ``x['pn'] != 1`` compared str to int (always true), and the
          pre-increment made the stop check fire before page 30 was
          ever requested — only pages 1-29 were fetched.
        """
        x = response.meta['x']
        self.logger.info('当前url提交的表单是 %s', x)
        json_d = json.loads(response.text)
        content_d = json_d["content"]
        info = content_d["positionResult"]["result"]  # 包含每一条数据的列表, 每一条数据是一个字典
        self.logger.info('%s %s', len(info), type(info))
        for data in info:
            yield data  # hand each job dict to the item pipeline
        self.logger.info('当前页面提取成功')
        # Advance AFTER processing so page MAX_PAGE itself is still fetched.
        next_pn = int(x['pn']) + 1
        if next_pn > self.MAX_PAGE:
            return
        x['pn'] = str(next_pn)
        x['first'] = 'false'
        yield scrapy.FormRequest(
            'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
            headers=self.headers,
            formdata=x,
            callback=self.parse2,
            meta={'x': x},
        )



